In [314]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn import model_selection, metrics, cluster, preprocessing
from sklearn.ensemble import RandomForestRegressor
import statsmodels.formula.api as sm
import copy as cp
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
import random
In [315]:
def wide_to_long(df, new_var_1, new_var_2, old_var_list):
    new_df = df.drop(old_var_list[1:], axis=1).rename(columns = {old_var_list[0]:new_var_2})
    new_df[new_var_1] = old_var_list[0]
    var_list_fl = cp.copy(old_var_list[1:])
    for value in var_list_fl:
        var_list_temp = cp.copy(old_var_list)
        var_list_temp.remove(value)
        df_temp = df.drop(var_list_temp, axis=1).rename(columns = {value:new_var_2})
        df_temp[new_var_1] = value
        new_df = new_df.append(df_temp, ignore_index = True)
    return(new_df)

def long_to_wide(df, headers, values):
    new_df = df.drop([headers, values], axis=1).drop_duplicates()
    merge_cols = new_df.columns.tolist()
    name_list = df[[headers]].drop_duplicates()[headers].tolist()
    for name in name_list:
        temp_df = df.loc[df[headers] == name].rename(columns={values:name}).drop(headers, axis=1)
        new_df = new_df.merge(temp_df, on=merge_cols, how='left')
    return(new_df)
In [316]:
data = pd.read_csv('data_for_models.csv').drop('Unnamed: 0', axis=1)
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')

# add dummies for Christmas and Thanksgiving
data['Christmas'] = 0
data.loc[data.Holiday == 'Christmas', 'Christmas'] = 1

data['Thanksgiving'] = 0
data.loc[data.Holiday == 'Thanksgiving', 'Thanksgiving'] = 1

# add dummy for Store C
data['Type_C'] = 0
data.loc[data.Type == 'C', 'Type_C'] = 1

for i in range(1, 6):
    promo = 'Promotion'+str(i)
    I0_promo = 'I0_' + promo
    data[I0_promo] = data[promo]
    data.loc[np.isnan(data[I0_promo]), I0_promo] = 0

rlist = []
for i in range(len(data)):
    import random
    rlist.append(random.randint(1,101))
data['random_int'] = rlist
In [317]:
max_date = data['Date'].max()
min_date = data['Date'].min()
In [318]:
# missing weekly_sales values
data_temp = pd.DataFrame(pd.pivot_table(data, values='Weekly_Sales', index=['Store', 'Dept'], columns=['Date'])).T
temp_sum = len(data_temp.columns.tolist()) * len(data_temp)
data_temp = pd.DataFrame(data_temp.isnull().sum())
data_temp = data_temp.loc[data_temp[0] != 0]
missing = data_temp[0].sum()
print('Missing ' + str(missing) + ' of ' + str(temp_sum), '\n', 'Percent missing: ' + str(missing/temp_sum * 100))
Missing 54763 of 476333 
 Percent missing: 11.496789011048993
In [319]:
# testing pivto tables
data_temp = pd.DataFrame(pd.pivot_table(data, values='Weekly_Sales', index=['Store', 'Dept', 'Week'], columns=['Year'])).reset_index()
In [320]:
mean_ws = data[['Weekly_Sales', 'Store', 'Dept', 'Year']]\
    .groupby(['Store', 'Dept', 'Year']).mean().reset_index().rename(columns={'Weekly_Sales':'Mean'})

mean_ws = wide_to_long(data_temp, 'Year', 'Weekly_Sales', [2010, 2011, 2012])\
    .merge(mean_ws, on=['Store', 'Dept', 'Year'], how='left')\
    .rename(columns={'Weekly_Sales':'Imp_WS_Mean'})
mean_ws.loc[np.isnan(mean_ws.Imp_WS_Mean), 'Imp_WS_Mean'] = mean_ws['Mean']
In [321]:
ws = data[['Store', 'Dept', 'Date', 'Weekly_Sales', 'Previous_Year_Sales']]

data2 = mean_ws.drop('Mean', axis=1).merge(data.drop(['Weekly_Sales', 'Previous_Year_Sales'], axis=1).drop_duplicates(),
                                          on=['Store', 'Dept', 'Week', 'Year'], how='left')\
                                   #.merge(ws, on=['Store', 'Dept', 'Date'], how='left')
#data2['Imp_WS_0'] = data2['Weekly_Sales']
#data2.loc[np.isnan(data2.Imp_WS_0), 'Imp_WS_0'] = 0

# remove beginnings and ends added on
data2 = data2.loc[(data2.Date >= min_date) & (data2.Date <= min_date)].drop_duplicates()
In [322]:
data2.head()
Out[322]:
Store Dept Week Imp_WS_Mean Year Date Temperature Fuel_Price Promotion1 Promotion2 ... cpi_new Christmas Thanksgiving Type_C I0_Promotion1 I0_Promotion2 I0_Promotion3 I0_Promotion4 I0_Promotion5 random_int
4 1 1 5 24924.50 2010 2010-02-05 42.31 2.572 NaN NaN ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 18.0
56 1 2 5 50605.27 2010 2010-02-05 42.31 2.572 NaN NaN ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 16.0
108 1 3 5 13740.12 2010 2010-02-05 42.31 2.572 NaN NaN ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 38.0
160 1 4 5 39954.04 2010 2010-02-05 42.31 2.572 NaN NaN ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0
212 1 5 5 32229.38 2010 2010-02-05 42.31 2.572 NaN NaN ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 48.0

5 rows × 40 columns

In [323]:
len(data2)#.loc[np.isnan(data2.Previous_Year_Sales)]
Out[323]:
2955
In [324]:
# these are the number of missing values
data2.isnull().sum()
Out[324]:
Store                    0
Dept                     0
Week                     0
Imp_WS_Mean              0
Year                     0
Date                     0
Temperature              0
Fuel_Price               0
Promotion1            2955
Promotion2            2955
Promotion3            2955
Promotion4            2955
Promotion5            2955
Unemployment             0
IsHoliday                0
Type                     0
Size                     0
size_cat                 0
Imputed_Holiday          0
Holiday                  0
Imputed_Promotion1       0
Imputed_Promotion2       0
Imputed_Promotion3       0
Imputed_Promotion4       0
Imputed_Promotion5       0
Ln_Promotion1            0
Ln_Promotion2            0
Ln_Promotion3            0
Ln_Promotion4            0
Ln_Promotion5            0
cpi_new                  0
Christmas                0
Thanksgiving             0
Type_C                   0
I0_Promotion1            0
I0_Promotion2            0
I0_Promotion3            0
I0_Promotion4            0
I0_Promotion5            0
random_int               0
dtype: int64
In [325]:
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15,10))

data_temp = data[['Store', 'Date', 'Temperature']].drop_duplicates()
sns.boxplot(ax=axs[0, 0], x = 'Store', y = 'Temperature', data = data_temp)

data_temp = data[['Store', 'Date', 'Unemployment']].drop_duplicates()
sns.boxplot(ax=axs[0, 1], x = 'Store', y = 'Unemployment', data = data_temp)

data_temp = data[['Store', 'Date', 'Fuel_Price']].drop_duplicates()
sns.boxplot(ax=axs[1, 0], x = 'Store', y = 'Fuel_Price', data = data_temp)

data_temp = data[['Store', 'Date', 'cpi_new']].drop_duplicates()
sns.boxplot(ax=axs[1, 1], x = 'Store', y = 'cpi_new', data = data_temp)
Out[325]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b95d4546a0>
In [326]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 41 columns):
Store                  421570 non-null int64
Dept                   421570 non-null int64
Weekly_Sales           421570 non-null float64
Year                   421570 non-null int64
Week                   421570 non-null int64
Date                   421570 non-null datetime64[ns]
Temperature            421570 non-null float64
Fuel_Price             421570 non-null float64
Promotion1             150681 non-null float64
Promotion2             111248 non-null float64
Promotion3             137091 non-null float64
Promotion4             134967 non-null float64
Promotion5             151432 non-null float64
Unemployment           421570 non-null float64
IsHoliday              421570 non-null bool
Type                   421570 non-null object
Size                   421570 non-null int64
size_cat               421570 non-null object
Imputed_Holiday        421570 non-null bool
Holiday                421570 non-null object
Imputed_Promotion1     421570 non-null float64
Imputed_Promotion2     421570 non-null float64
Imputed_Promotion3     421570 non-null float64
Imputed_Promotion4     421570 non-null float64
Imputed_Promotion5     421570 non-null float64
Ln_Promotion1          421479 non-null float64
Ln_Promotion2          415442 non-null float64
Ln_Promotion3          420479 non-null float64
Ln_Promotion4          421570 non-null float64
Ln_Promotion5          421426 non-null float64
Previous_Year_Sales    261541 non-null float64
cpi_new                421570 non-null float64
Christmas              421570 non-null int64
Thanksgiving           421570 non-null int64
Type_C                 421570 non-null int64
I0_Promotion1          421570 non-null float64
I0_Promotion2          421570 non-null float64
I0_Promotion3          421570 non-null float64
I0_Promotion4          421570 non-null float64
I0_Promotion5          421570 non-null float64
random_int             421570 non-null int64
dtypes: bool(2), datetime64[ns](1), float64(26), int64(9), object(3)
memory usage: 126.2+ MB
In [327]:
df = data[['Store', 'cpi_new', 'Date']].drop_duplicates().sort_values(['Date', 'Store'])
filter_date = df.loc[df.Date != df['Date'].min()]['Date'].min()
df = df.loc[df.Date == filter_date]

cpi_list = df[['cpi_new']].drop_duplicates()['cpi_new'].tolist()
group_list = []
for i in range(len(cpi_list)):
    a = 'R' + str(i+1)
    group_list.append(a)

df_temp = df[['cpi_new']].drop_duplicates()
df_temp['region'] = group_list

df = df.merge(df_temp, on='cpi_new', how='left')

data = data.merge(df[['Store', 'region']], on='Store', how = 'left')
In [328]:
df = data[['Store', 'cpi_new', 'Date']].drop_duplicates().sort_values(['Date', 'Store'])
df = df.loc[df.Date == df['Date'].max()]

cpi_list = df[['cpi_new']].drop_duplicates()['cpi_new'].tolist()
group_list = []
for i in range(len(cpi_list)):
    a = 'R' + str(i+1)
    group_list.append(a)

df_temp = df[['cpi_new']].drop_duplicates()
df_temp['region_2'] = group_list

df = df.merge(df_temp, on='cpi_new', how='left')

data = data.merge(df[['Store', 'region_2']], on='Store', how = 'left')
In [329]:
test_temp = data[['Store', 'region', 'region_2']].drop_duplicates()
len(test_temp.loc[test_temp.region != test_temp.region_2])
Out[329]:
0
In [330]:
data = data.drop('region_2', axis=1)
In [331]:
df = data[['Store', 'cpi_new', 'Temperature']]

mycols = ['Mean_Temp', 'Mean_CPI']

mean_temp = data[['Store', 'Temperature']].drop_duplicates().groupby('Store').mean().rename(columns={'Temperature':'Mean_Temp'})
mean_cpi = data[['Store', 'cpi_new']].drop_duplicates().groupby('Store').mean().rename(columns={'cpi_new':'Mean_CPI'})

df = df.merge(mean_temp, on='Store', how='left')\
       .merge(mean_cpi, on='Store', how='left')

min_max_scaler = preprocessing.MinMaxScaler()
data_mm = pd.DataFrame(min_max_scaler.fit_transform(df[mycols].astype(float)))
data_mm.index = df[mycols].index

to_keep = df[mycols].columns.tolist()
to_keep_mm = []
for var in to_keep:
    to_keep_mm.append(var + '_mm')
data_mm.columns = to_keep_mm

df = df.join(data_mm)

cluster_data = df[['Store', 'Mean_Temp_mm', 'Mean_CPI_mm']].drop_duplicates()
In [332]:
# cluster based on mean temperature, unemployment, and fuel price

# Perform PCA
pca=PCA(n_components=None) # Create an instance of the PCA class

# Fit the data to the model
pcaResults=pca.fit_transform(cluster_data.drop('Store', axis=1))
explained_variance=pca.explained_variance_ratio_ # Extract the varience

# use the elbow method
wcss=[]

for i in range(1,11):
    kmeans=cluster.KMeans(n_clusters=i, init='k-means++', max_iter=100, n_init=10, random_state=0)
    kmeans.fit(pcaResults)
    wcss.append(kmeans.inertia_)
    
# Plot the WCSS results
plt.plot(range(1,11), wcss)
plt.title('The elbow method')
plt.xlabel('number of clusters')
plt.ylabel('WCSS')
plt.show()
In [333]:
explained_variance = pca.explained_variance_ratio_ # Extract the varience
print(np.around(explained_variance,3) )
[0.698 0.302]
In [334]:
'''
# Let's try 3 clusters

# Perform PCA
pca = PCA(n_components=3) # Now specify 2 components
results = pca.fit_transform(cluster_data.drop('Store', axis=1))

# Show Plot
plt.scatter(results[:,0], results[:,1])
plt.title('PCA Analysis')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()
'''
Out[334]:
"\n# Let's try 3 clusters\n\n# Perform PCA\npca = PCA(n_components=3) # Now specify 2 components\nresults = pca.fit_transform(cluster_data.drop('Store', axis=1))\n\n# Show Plot\nplt.scatter(results[:,0], results[:,1])\nplt.title('PCA Analysis')\nplt.xlabel('PC1')\nplt.ylabel('PC2')\nplt.show()\n"
In [335]:
'''
km2 = cluster.KMeans(n_clusters=3)

np.random.seed(1234)
k2cls = km2.fit(cluster_data.drop('Store', axis=1))

cluster_data['cluster'] = k2cls.labels_
'''
Out[335]:
"\nkm2 = cluster.KMeans(n_clusters=3)\n\nnp.random.seed(1234)\nk2cls = km2.fit(cluster_data.drop('Store', axis=1))\n\ncluster_data['cluster'] = k2cls.labels_\n"
In [336]:
'''
data = data.merge(cluster_data[['Store', 'cluster']], on='Store', how='left')

data['cluster_1'] = 0
data.loc[data.cluster == 1, 'cluster_1'] = 1
data['cluster_2'] = 0
data.loc[data.cluster == 2, 'cluster_2'] = 1
'''
Out[336]:
"\ndata = data.merge(cluster_data[['Store', 'cluster']], on='Store', how='left')\n\ndata['cluster_1'] = 0\ndata.loc[data.cluster == 1, 'cluster_1'] = 1\ndata['cluster_2'] = 0\ndata.loc[data.cluster == 2, 'cluster_2'] = 1\n"
In [337]:
data_temp = data[['Date', 'Temperature', 'region']].drop_duplicates()
sns.boxplot(x = 'region', y = 'Temperature', data = data_temp)
Out[337]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b90b1317f0>
In [338]:
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(30,20))
data_temp = data[['Date', 'Weekly_Sales', 'region', 'Type']].drop_duplicates()
data_temp['ln_ws'] = np.log(data_temp['Weekly_Sales'] + 1)

sns.boxplot(ax = axs[0], x = 'region', y = 'Weekly_Sales', hue = 'Type', data = data_temp)
axs[0].set_xlabel = ('Region')
axs[0].set_ylabel = ('Weekly Sales (US$)')

sns.boxplot(ax = axs[1], x = 'region', y = 'ln_ws', hue = 'Type', data = data_temp)
axs[1].set_xlabel = ('Region')
axs[1].set_ylabel = ('Ln Weekly Sales (US$)')
C:\Users\gylk\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:3: RuntimeWarning: divide by zero encountered in log
  This is separate from the ipykernel package so we can avoid doing imports until
C:\Users\gylk\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:3: RuntimeWarning: invalid value encountered in log
  This is separate from the ipykernel package so we can avoid doing imports until
In [339]:
fig, axs = plt.subplots(nrows=5, ncols=1, figsize=(30,40))

for i in range(5):
    promo = 'Promotion' + str(i+1)
    data_temp = data[['Date', promo, 'region', 'Type']].drop_duplicates()
    sns.boxplot(ax = axs[i], x = 'region', y = promo, hue = 'Type', data = data_temp)
    axs[i].set_xlabel = ('Region')
    axs[i].set_ylabel = (promo.replace('tion', 'tion '))
In [340]:
fig, axs = plt.subplots(nrows=15, ncols=5, figsize=(30,50))

for i in range(5):
    promo = 'Promotion' + str(i+1)
    for j in range(15):
        region = 'R' + str(j+1)
        data.loc[data.region == region].plot.scatter(ax=axs[j, i], y='Weekly_Sales', x=promo)
In [341]:
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20,20))

st=['A', 'B', 'C']
for i in range(3):
    data.loc[data.Type == st[i]][['Date', 'region', 'Weekly_Sales']]\
        .drop_duplicates().groupby(['region', 'Date']).mean().unstack().T\
        .reset_index().drop('level_0', axis=1).set_index('Date')\
        .plot(ax=axs[i], legend=False)
    axs[i].set_title('Store Type: ' + st[i])
    axs[i].set_ylabel('Mean Weekly Sales (US$)')

plt.savefig('RegionsSalesType.png')
In [342]:
fig, axs = plt.subplots(nrows=15, ncols=1, figsize=(20,60))

for i in range(15):
    reg = 'R' + str(i+1)
    data.loc[data.region == reg][['Date', 'Dept', 'Weekly_Sales']]\
        .drop_duplicates().groupby(['Dept', 'Date']).mean().unstack().T\
        .reset_index().drop('level_0', axis=1).set_index('Date')\
        .plot(ax=axs[i], legend=False)
    axs[i].set_title('Region ' + str(i+1))
    axs[i].set_ylabel('Mean Weekly Sales (US$)')

plt.savefig('RegionsSalesDept.png')
In [343]:
fig, ax = plt.subplots(figsize=(20,8))

data[['Date', 'region', 'Weekly_Sales']]\
    .drop_duplicates().groupby(['region', 'Date']).mean().unstack().T\
    .reset_index().drop('level_0', axis=1).set_index('Date')\
    .plot(ax=ax, legend=False)
ax.set_ylabel('Mean Weekly Sales (US$)')

plt.savefig('RegionsSales.png')
In [344]:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7.5,7.5))
store = ['A', 'B', 'C']
colours = ['blue', 'orange', 'green']
for i in range(len(store)):
    data.loc[data.Type == store[i]].plot.scatter(ax=ax, x='Previous_Year_Sales', y='Weekly_Sales', s=0.8, c=colours[i])
    
plt.savefig('prevSalesType.png')
In [345]:
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(30,10))

store = ['A', 'B', 'C']
colours = ['blue', 'orange', 'green']

for i in range(len(store)):
    data.loc[data.Type == store[i]].plot.scatter(ax=axs[i], x='Previous_Year_Sales', y='Weekly_Sales', s=0.8, c=colours[i])
    axs[i].set_title('Store Type: ' + store[i])
    axs[i].set_xlabel('Weekly Sales Previous Year (US$)', fontsize=12)
    axs[i].set_ylabel('Weekly Sales Current Year (US$)', fontsize=12)

plt.savefig('PrevSalesType.png')
In [346]:
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(30,10))

store = ['A', 'B', 'C']
colours = ['blue', 'orange', 'green']

for i in range(len(store)):
    df = data.loc[data.Type == store[i]]
    sns.regplot(ax=axs[i],
            x=df['Weekly_Sales'], y=df['Previous_Year_Sales'], 
            line_kws={'color' : 'red'},
            scatter_kws={'s': 5},
           )
    axs[i].set_title('Store Type: ' + store[i])
    axs[i].set_xlabel('Weekly Sales Previous Year (US$)', fontsize=12)
    axs[i].set_ylabel('Weekly Sales Current Year (US$)', fontsize=12)

plt.savefig('PrevSalesTypeLINE.png')
In [347]:
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20,15))
store = ['A', 'B', 'C']
colours = ['blue', 'orange']
variables = ['Weekly_Sales', 'Previous_Year_Sales']
for i in range(len(store)):
    for j in range(2):
        data.loc[data.Type == store[i]][['Date', variables[j]]].groupby('Date').mean().unstack().T\
        .plot.line(ax=axs[i], y=variables[j], x='Date', c=colours[j])
        axs[i].set_title('Store Type: ' + store[i])
In [348]:
fig, axs = plt.subplots(nrows=5, ncols=3, figsize=(30,30))
store = ['A', 'B', 'C']
colours = ['blue', 'orange', 'green']
for i in range(len(store)):
    for j in range(5):
        promo = 'Promotion' + str(j+1)
        data.loc[data.Type == store[i]].plot.scatter(ax=axs[j, i],
                                                     x=promo, 
                                                     y='Weekly_Sales', 
                                                     s=0.8, 
                                                     c=colours[i])
In [349]:
'''fig, ax = plt.subplots(figsize=(30,10))
sns.boxplot(ax=ax, x = 'Type', y = 'Weekly_Sales', hue = 'cluster', data = data)'''
Out[349]:
"fig, ax = plt.subplots(figsize=(30,10))\nsns.boxplot(ax=ax, x = 'Type', y = 'Weekly_Sales', hue = 'cluster', data = data)"
In [350]:
fig, axs = plt.subplots(nrows=5, ncols=1, figsize=(30, 30))
for i in range(5):
    promo = 'Promotion' + str(i+1)
    sns.boxplot(ax= axs[i], x = 'Store', y = promo, hue = 'Holiday', data = data.loc[data.Type == 'A'])
    axs[i].set_title(promo)
In [351]:
fig, axs = plt.subplots(nrows=5, ncols=1, figsize=(30, 30))
for i in range(5):
    promo = 'Promotion' + str(i+1)
    sns.boxplot(ax= axs[i], x = 'Store', y = promo, hue = 'Holiday', data = data.loc[data.Type == 'B'])
    axs[i].set_title(promo)
In [352]:
fig, axs = plt.subplots(nrows=5, ncols=1, figsize=(30, 30))
for i in range(5):
    promo = 'Promotion' + str(i+1)
    sns.boxplot(ax= axs[i], x = 'Store', y = promo, hue = 'Holiday', data = data.loc[data.Type == 'C'])
    axs[i].set_title(promo)
In [353]:
data_temp = data[['Weekly_Sales', 
    'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5',
    'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5',
    'Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5']]\
    .corr()\
    #.sort_values('Weekly_Sales', ascending = False)

axsns = plt.subplots(figsize=(20, 10))
sns.heatmap(data_temp, annot=True, cmap='Blues', linewidths=.5)
Out[353]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b961bb5518>
In [354]:
data_temp = data.loc[data.Type != 'C'][['Weekly_Sales', 
    'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5',
    'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5',
    'Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5']]\
    .corr()\
    .sort_values('Weekly_Sales', ascending = False)

axsns = plt.subplots(figsize=(20, 10))
sns.heatmap(data_temp, annot=True, cmap='Blues', linewidths=.5)
Out[354]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b95f8f4a58>
In [355]:
data_temp = data.loc[data.Type == 'C'][['Weekly_Sales', 
    'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5',
    'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5',
    'Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5']]\
    .corr()\
    .sort_values('Weekly_Sales', ascending = False)

axsns = plt.subplots(figsize=(20, 10))
sns.heatmap(data_temp, annot=True, cmap='Blues', linewidths=.5)
Out[355]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b955beadd8>
In [356]:
data_temp = data[['Weekly_Sales', 
    'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5',
    'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5',
    'Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5',
    'Christmas', 'Thanksgiving']]\
    .corr()\
    .sort_values('Weekly_Sales', ascending = False)

axsns = plt.subplots(figsize=(20, 10))
sns.heatmap(data_temp, annot=True, cmap='Blues', linewidths=.5)
Out[356]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b95d25d390>
In [357]:
# other variables

data_temp = data[['Weekly_Sales', 'Previous_Year_Sales',
    #'cpi_new', 
    'Fuel_Price', 'Temperature', 'Size', 'Type_C',
    'Christmas', 'Thanksgiving', 'IsHoliday', 'Imputed_Holiday', 'Unemployment']]

data_temp['colour_correction'] = data_temp['Weekly_Sales'] *-1

axsns = plt.subplots(figsize=(20, 10))
sns.heatmap(data_temp.corr(), annot=True, cmap='RdBu', linewidths=.5)
C:\Users\gylk\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
Out[357]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b9066d4c18>

Grouping Departments¶

In [358]:
data.head()
Out[358]:
Store Dept Weekly_Sales Year Week Date Temperature Fuel_Price Promotion1 Promotion2 ... Christmas Thanksgiving Type_C I0_Promotion1 I0_Promotion2 I0_Promotion3 I0_Promotion4 I0_Promotion5 random_int region
0 1 1 24924.50 2010 5 2010-02-05 42.31 2.572 NaN NaN ... 0 0 0 0.0 0.0 0.0 0.0 0.0 18 R1
1 1 2 50605.27 2010 5 2010-02-05 42.31 2.572 NaN NaN ... 0 0 0 0.0 0.0 0.0 0.0 0.0 16 R1
2 1 3 13740.12 2010 5 2010-02-05 42.31 2.572 NaN NaN ... 0 0 0 0.0 0.0 0.0 0.0 0.0 38 R1
3 1 4 39954.04 2010 5 2010-02-05 42.31 2.572 NaN NaN ... 0 0 0 0.0 0.0 0.0 0.0 0.0 9 R1
4 1 5 32229.38 2010 5 2010-02-05 42.31 2.572 NaN NaN ... 0 0 0 0.0 0.0 0.0 0.0 0.0 48 R1

5 rows × 42 columns

In [359]:
# plot department means (by weeks) by store type 
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20,20))
st = ['A', 'B', 'C']
for i in range(3):
    data.loc[data.Type == st[i]][['Week', 'Dept', 'Weekly_Sales']].groupby(['Dept', 'Week']).mean().unstack()\
    .T.reset_index().drop('level_0', axis=1).set_index('Week')\
    .plot(ax=axs[i], legend=False)
    axs[i].set_title(st[i])
In [360]:
data_summary = data[['Type', 'Dept', 'Weekly_Sales']].groupby(['Dept', 'Type']).describe().unstack().T.reset_index()
data_summary = data_summary.loc[data_summary.level_1 == 'max'].append(data_summary.loc[data_summary.level_1 == 'min']).drop('level_0', axis=1).set_index(['Type', 'level_1']).T
In [361]:
st = ['A', 'B', 'C']

for i in range(3):
    data_summary[st[i], 'range'] = data_summary[st[i], 'max'] - data_summary[st[i], 'min']
In [362]:
data_summary = data.loc[data.Week > 45][['Week', 'Type', 'Dept', 'Weekly_Sales']].groupby(['Dept', 'Type', 'Week']).mean().unstack().reset_index()
In [363]:
data_summary.loc[data_summary[('Type', '')] == 'A'].drop(('Type', ''), axis=1).set_index(('Dept', '')).T\
.plot(figsize=(30, 20))
Out[363]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b905bbbf60>
In [364]:
data_temp = data_summary.loc[data_summary[('Type', '')] == 'A'].drop(('Type', ''), axis=1).set_index(('Dept', ''))
In [365]:
data_temp = data_summary.set_index([('Dept', ''), ('Type', '')]).fillna(0)
In [366]:
for i in range(47, 53):
    data_temp[('pct_change', i)] = (data_temp[('Weekly_Sales', int(i))] - data_temp[('Weekly_Sales', int(i-1))])/data_temp[('Weekly_Sales', int(i))] * 100
In [367]:
temp_summary = data_temp['pct_change', ].T.describe().T[['max', 'min']]
temp_summary['range'] = temp_summary['max'] - temp_summary['min']
C:\Users\gylk\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\lib\function_base.py:4406: RuntimeWarning: invalid value encountered in multiply
  x2 = take(ap, indices_above, axis=axis) * weights_above
In [368]:
temp2 = temp_summary[['range']].loc[temp_summary.range != np.inf].unstack()

temp2['xmas_peak'] = 0
temp2.loc[temp2[('range', 'B')] > 100, 'xmas_peak']= 1

#temp2.hist(bins=20, figsize=(20,10))
In [369]:
data = data.merge(temp2.reset_index()[['xmas_peak', 'Dept']].T.reset_index().drop(('Type', ''), axis=1).set_index('level_0').T,
                  on='Dept', how='left')
In [370]:
data[['Date', 'Type', 'xmas_peak', 'Weekly_Sales']].groupby(['Type', 'xmas_peak', 'Date']).mean().unstack().T.plot(figsize=(20,10))
Out[370]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b9061c9a58>
In [371]:
data['model_group'] = 'ab_np'
data.loc[(data.Type_C == 0) & (data.xmas_peak == 1), 'model_group'] = 'ab_p'
data.loc[(data.Type_C == 1), 'model_group'] = 'c'
In [372]:
data['Cluster'] = '1'
data.loc[data.model_group == 'ab_np', 'Cluster'] = '2'
data.loc[data.model_group == 'ab_p', 'Cluster'] = '3'
data[['Date', 'Cluster', 'Weekly_Sales']].groupby(['Cluster', 'Date']).sum().unstack().T.plot(figsize=(20,10))
plt.savefig('modelgroupSalesDate.png')
In [373]:
'''
sns.set(style="ticks")
sns.pairplot(data[['Weekly_Sales', 'Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 
                   'Ln_Promotion4', 'Ln_Promotion5', 'model_group']].dropna(), hue='model_group')
'''
Out[373]:
'\nsns.set(style="ticks")\nsns.pairplot(data[[\'Weekly_Sales\', \'Ln_Promotion1\', \'Ln_Promotion2\', \'Ln_Promotion3\', \n                   \'Ln_Promotion4\', \'Ln_Promotion5\', \'model_group\']].dropna(), hue=\'model_group\')\n'
In [374]:
'''
sns.set(style="ticks")
sns.pairplot(data[['Weekly_Sales', 'Promotion1', 'Promotion2', 'Promotion3', 
                   'Promotion4', 'Promotion5', 'model_group']].dropna(), hue='model_group')
'''
Out[374]:
'\nsns.set(style="ticks")\nsns.pairplot(data[[\'Weekly_Sales\', \'Promotion1\', \'Promotion2\', \'Promotion3\', \n                   \'Promotion4\', \'Promotion5\', \'model_group\']].dropna(), hue=\'model_group\')\n'
In [375]:
'''
sns.set(style="ticks")
sns.pairplot(data[['Weekly_Sales', 'Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 
                   'Ln_Promotion4', 'Ln_Promotion5', 'Type']].dropna(), hue='Type')
'''
Out[375]:
'\nsns.set(style="ticks")\nsns.pairplot(data[[\'Weekly_Sales\', \'Ln_Promotion1\', \'Ln_Promotion2\', \'Ln_Promotion3\', \n                   \'Ln_Promotion4\', \'Ln_Promotion5\', \'Type\']].dropna(), hue=\'Type\')\n'
In [376]:
# dept 16 does not fit!!    
data.loc[
    (data.model_group == 'ab_np') #&(data.Dept !=16)
    ][['Weekly_Sales', 'Date', 'Dept']].groupby(['Dept', 'Date']).mean().unstack().T\
    .plot(figsize=(20, 15))
Out[376]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b961be4358>
In [377]:
# dept 3 does not fit!!    
data.loc[
    (data.model_group == 'ab_p') #&(data.Dept !=3)
    ][['Weekly_Sales', 'Date', 'Dept']].groupby(['Dept', 'Date']).mean().unstack().T\
    .plot(figsize=(20, 15))
Out[377]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b96183f908>
In [378]:
# only 3 and 16    
data.loc[
    (data.Dept==16) | (data.Dept==3)
    ][['Weekly_Sales', 'Date', 'Dept']].groupby(['Dept', 'Date']).mean().unstack().T\
    .plot(figsize=(10, 7))
Out[378]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b9b129d550>
In [379]:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7.5,7.5))
group = ['ab_np', 'ab_p', 'c']
colours = ['blue', 'orange', 'green']
for i in range(len(store)):
    data.loc[data.model_group == group[i]].plot.scatter(ax=ax, x='Previous_Year_Sales', y='Weekly_Sales', s=0.8, c=colours[i])
In [380]:
data[['Weekly_Sales', 'model_group']].groupby('model_group').describe()
Out[380]:
Weekly_Sales
count mean std min 25% 50% 75% max
model_group
ab_np 154094.0 25761.277511 29033.271563 -771.90 4336.1825 15850.455 37525.885 293966.05
ab_p 224879.0 10503.680481 15410.568653 -4988.94 1930.0000 5886.070 13481.125 693099.36
c 42597.0 9519.532538 15985.351612 -379.00 131.9900 1149.670 12695.010 112152.35
In [381]:
test = data.loc[data.Type != 'C'][['Dept', 'model_group']].drop_duplicates()
ab_np_list = test.loc[test.model_group == 'ab_np'].sort_values('Dept')['Dept'].tolist()
ab_p_list = test.loc[test.model_group == 'ab_p'].sort_values('Dept')['Dept'].tolist()
In [382]:
print('no peak group: ' + str(len(ab_np_list)), '\n', 'peak group: ' + str(len(ab_p_list)))
no peak group: 35 
 peak group: 46
In [383]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 421570 entries, 0 to 421569
Data columns (total 45 columns):
Store                  421570 non-null int64
Dept                   421570 non-null int64
Weekly_Sales           421570 non-null float64
Year                   421570 non-null int64
Week                   421570 non-null int64
Date                   421570 non-null datetime64[ns]
Temperature            421570 non-null float64
Fuel_Price             421570 non-null float64
Promotion1             150681 non-null float64
Promotion2             111248 non-null float64
Promotion3             137091 non-null float64
Promotion4             134967 non-null float64
Promotion5             151432 non-null float64
Unemployment           421570 non-null float64
IsHoliday              421570 non-null bool
Type                   421570 non-null object
Size                   421570 non-null int64
size_cat               421570 non-null object
Imputed_Holiday        421570 non-null bool
Holiday                421570 non-null object
Imputed_Promotion1     421570 non-null float64
Imputed_Promotion2     421570 non-null float64
Imputed_Promotion3     421570 non-null float64
Imputed_Promotion4     421570 non-null float64
Imputed_Promotion5     421570 non-null float64
Ln_Promotion1          421479 non-null float64
Ln_Promotion2          415442 non-null float64
Ln_Promotion3          420479 non-null float64
Ln_Promotion4          421570 non-null float64
Ln_Promotion5          421426 non-null float64
Previous_Year_Sales    261541 non-null float64
cpi_new                421570 non-null float64
Christmas              421570 non-null int64
Thanksgiving           421570 non-null int64
Type_C                 421570 non-null int64
I0_Promotion1          421570 non-null float64
I0_Promotion2          421570 non-null float64
I0_Promotion3          421570 non-null float64
I0_Promotion4          421570 non-null float64
I0_Promotion5          421570 non-null float64
random_int             421570 non-null int64
region                 421570 non-null object
xmas_peak              421307 non-null float64
model_group            421570 non-null object
Cluster                421570 non-null object
dtypes: bool(2), datetime64[ns](1), float64(27), int64(9), object(6)
memory usage: 142.3+ MB
In [384]:
tidy = data[['Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5', 'cpi_new']]\
    .stack()\
    .reset_index().set_index('level_0')\
    .rename(columns={0: 'value','level_1': 'var'})\
    .join(data[['model_group', 'Type', 'Weekly_Sales']])

g = sns.FacetGrid(tidy, row='model_group', col='var',  hue='Type')
g = g.map(plt.scatter, 'value', 'Weekly_Sales', s=0.5)
In [385]:
data['ab_np'] = 0
data.loc[data.model_group == 'ab_np', 'ab_np'] = 1
data['ab_p'] = 0
data.loc[data.model_group == 'ab_p', 'ab_p'] = 1

Regression Models¶

In [386]:
# mean_absolute_percentage_error

def mean_absolute_percentage_error(y_true, y_pred):
    return(np.mean(np.abs((y_true - y_pred)/y_true)) *100)

Linear Regression without Validation¶

In [387]:
# Model Functions
def linear_model(data, y, x_list):
    x_vars = x_list[0]
    for i in range(1, len(x_list)):
        x_vars += '+' + x_list[i]
    function = y + '~' + x_vars
    m1=sm.ols(function, data).fit()
    return(m1)

def lr_and_print(data, y, x_list):
    to_keep = cp.copy(x)
    to_keep.append(y)
    model_data = data[to_keep].dropna()
    m2 = linear_model(model_data, y, x)
    rmse = np.sqrt(metrics.mean_squared_error(model_data[y], m2.fittedvalues))
    print('RSME: ' + str(rmse))
    return(m2)
In [388]:
# run baseline model with only previous sales

y = 'Weekly_Sales'
x = ['Previous_Year_Sales']

m1=lr_and_print(data, y, x)
RSME: 4132.52569614263
In [389]:
# run baseline model with only previous sales

y = 'Weekly_Sales'
x = ['Previous_Year_Sales', 'random_int']

m1b=lr_and_print(data, y, x)
m1b.summary()
RSME: 4132.525158693705
Out[389]:
OLS Regression Results
Dep. Variable: Weekly_Sales R-squared: 0.967
Model: OLS Adj. R-squared: 0.967
Method: Least Squares F-statistic: 3.838e+06
Date: Mon, 10 Jun 2019 Prob (F-statistic): 0.00
Time: 16:18:07 Log-Likelihood: -2.5489e+06
No. Observations: 261541 AIC: 5.098e+06
Df Residuals: 261538 BIC: 5.098e+06
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 268.3696 17.289 15.522 0.000 234.483 302.256
Previous_Year_Sales 0.9940 0.000 2770.410 0.000 0.993 0.995
random_int -0.0723 0.277 -0.261 0.794 -0.615 0.471
Omnibus: 122491.821 Durbin-Watson: 1.576
Prob(Omnibus): 0.000 Jarque-Bera (JB): 67575322.434
Skew: -0.888 Prob(JB): 0.00
Kurtosis: 81.726 Cond. No. 5.94e+04


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.94e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [390]:
f,ax=plt.subplots(1, figsize=(9, 6))
sns.kdeplot(data['Weekly_Sales'], shade=True, ax=ax, label='$y$')
sns.kdeplot(m1.fittedvalues, shade=True, ax=ax, label='$\hat{y}_1$')
#sns.kdeplot(m1b.fittedvalues, shade=True, ax=ax, label='$\hat{y}_2$')

plt.show()

Weekly Sales by Store Type

In [391]:
# sensitivity test on promotion data --> for Types A and B

# models
y = 'Weekly_Sales'
x = ['Promotion3', 'Promotion5', 'random_int']
m2a = lr_and_print(data.loc[data.Type != 'C'], y, x)

x = ['I0_Promotion3', 'I0_Promotion5', 'random_int']
m2b = lr_and_print(data.loc[data.Type != 'C'], y, x)

x = ['Imputed_Promotion3', 'Imputed_Promotion5', 'random_int']
m2c = lr_and_print(data.loc[data.Type != 'C'], y, x)

x = ['Ln_Promotion3', 'Ln_Promotion5', 'random_int']
m2d = lr_and_print(data.loc[data.Type != 'C'], y, x)


#plots
data_temp = data.loc[data.Type != 'C']

f,ax=plt.subplots(nrows=1, ncols=4, figsize=(30, 10))
sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[0], label='$y$')
sns.kdeplot(m2a.fittedvalues, shade=True, ax=ax[0], label='$\hat{y}_1$')

sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[1], label='$y$')
sns.kdeplot(m2b.fittedvalues, shade=True, ax=ax[1], label='$\hat{y}_2$')

sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[2], label='$y$')
sns.kdeplot(m2c.fittedvalues, shade=True, ax=ax[2], label='$\hat{y}_2$')

sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[3], label='$y$')
sns.kdeplot(m2d.fittedvalues, shade=True, ax=ax[3], label='$\hat{y}_2$')

plt.show()
RSME: 24124.112537485296
RSME: 23198.063797295305
RSME: 23137.569275257294
RSME: 22907.21808450546
In [392]:
m2d.summary()
Out[392]:
OLS Regression Results
Dep. Variable: Weekly_Sales R-squared: 0.028
Model: OLS Adj. R-squared: 0.028
Method: Least Squares F-statistic: 3692.
Date: Mon, 10 Jun 2019 Prob (F-statistic): 0.00
Time: 16:18:09 Log-Likelihood: -4.3294e+06
No. Observations: 377845 AIC: 8.659e+06
Df Residuals: 377841 BIC: 8.659e+06
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept -1.474e+04 395.928 -37.241 0.000 -1.55e+04 -1.4e+04
Ln_Promotion3 1018.3782 16.362 62.241 0.000 986.310 1050.447
Ln_Promotion5 3450.7472 48.375 71.333 0.000 3355.934 3545.561
random_int 1.1056 1.279 0.864 0.388 -1.402 3.613
Omnibus: 274322.901 Durbin-Watson: 1.246
Prob(Omnibus): 0.000 Jarque-Bera (JB): 7700709.526
Skew: 3.195 Prob(JB): 0.00
Kurtosis: 24.173 Cond. No. 634.


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [393]:
# sensitivity test on promotion data --> for Type C

# models
y = 'Weekly_Sales'
x = ['Promotion2', 'Promotion4', 'random_int']
m3a = lr_and_print(data.loc[data.Type == 'C'], y, x)

x = ['I0_Promotion2', 'I0_Promotion4', 'random_int']
m3b = lr_and_print(data.loc[data.Type == 'C'], y, x)

x = ['Imputed_Promotion2', 'Imputed_Promotion4', 'random_int']
m3c = lr_and_print(data.loc[data.Type == 'C'], y, x)

x = ['Ln_Promotion2', 'Ln_Promotion4', 'random_int']
m3d = lr_and_print(data.loc[data.Type == 'C'], y, x)

# plots
data_temp = data.loc[data.Type == 'C']

f,ax=plt.subplots(nrows=1, ncols=4, figsize=(30, 10))
sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[0], label='$y$')
sns.kdeplot(m3a.fittedvalues, shade=True, ax=ax[0], label='$\hat{y}_1$')

sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[1], label='$y$')
sns.kdeplot(m3b.fittedvalues, shade=True, ax=ax[1], label='$\hat{y}_2$')

sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[2], label='$y$')
sns.kdeplot(m3c.fittedvalues, shade=True, ax=ax[2], label='$\hat{y}_2$')

sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[3], label='$y$')
sns.kdeplot(m3d.fittedvalues, shade=True, ax=ax[3], label='$\hat{y}_2$')

plt.show()
RSME: 15482.254303413758
RSME: 15984.655293356105
RSME: 15944.643713199457
RSME: 15937.817415014186
In [394]:
m3d.summary()
Out[394]:
OLS Regression Results
Dep. Variable: Weekly_Sales R-squared: 0.006
Model: OLS Adj. R-squared: 0.006
Method: Least Squares F-statistic: 84.48
Date: Mon, 10 Jun 2019 Prob (F-statistic): 1.71e-54
Time: 16:18:10 Log-Likelihood: -4.7263e+05
No. Observations: 42597 AIC: 9.453e+05
Df Residuals: 42593 BIC: 9.453e+05
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 7079.5243 242.444 29.201 0.000 6604.329 7554.720
Ln_Promotion2 416.1565 27.933 14.898 0.000 361.408 470.905
Ln_Promotion4 254.5829 49.278 5.166 0.000 157.996 351.170
random_int 2.3534 2.639 0.892 0.372 -2.818 7.525
Omnibus: 20607.107 Durbin-Watson: 1.050
Prob(Omnibus): 0.000 Jarque-Bera (JB): 106032.916
Skew: 2.382 Prob(JB): 0.00
Kurtosis: 9.087 Cond. No. 187.


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [395]:
# impact of model group as a dummy

y = 'Weekly_Sales'
x = ['model_group', 'Previous_Year_Sales']
m4a = lr_and_print(data, y, x)

m4a.summary()
RSME: 4115.756196796483
Out[395]:
OLS Regression Results
Dep. Variable: Weekly_Sales R-squared: 0.967
Model: OLS Adj. R-squared: 0.967
Method: Least Squares F-statistic: 2.580e+06
Date: Mon, 10 Jun 2019 Prob (F-statistic): 0.00
Time: 16:18:11 Log-Likelihood: -2.5478e+06
No. Observations: 261541 AIC: 5.096e+06
Df Residuals: 261537 BIC: 5.096e+06
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 844.0360 16.506 51.134 0.000 811.684 876.388
model_group[T.ab_p] -838.4328 18.190 -46.093 0.000 -874.085 -802.781
model_group[T.c] -445.1214 29.526 -15.076 0.000 -502.991 -387.252
Previous_Year_Sales 0.9885 0.000 2611.121 0.000 0.988 0.989
Omnibus: 115054.820 Durbin-Watson: 1.592
Prob(Omnibus): 0.000 Jarque-Bera (JB): 66343121.963
Skew: -0.724 Prob(JB): 0.00
Kurtosis: 81.012 Cond. No. 1.12e+05


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.12e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [396]:
# impact of model group as a dummy

y = 'Weekly_Sales'
x = ['model_group', 'random_int']
m4a = lr_and_print(data, y, x)

m4a.summary()
RSME: 21461.81755853572
Out[396]:
OLS Regression Results
Dep. Variable: Weekly_Sales R-squared: 0.107
Model: OLS Adj. R-squared: 0.107
Method: Least Squares F-statistic: 1.684e+04
Date: Mon, 10 Jun 2019 Prob (F-statistic): 0.00
Time: 16:18:12 Log-Likelihood: -4.8029e+06
No. Observations: 421570 AIC: 9.606e+06
Df Residuals: 421566 BIC: 9.606e+06
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 2.57e+04 79.657 322.674 0.000 2.55e+04 2.59e+04
model_group[T.ab_p] -1.526e+04 70.975 -214.971 0.000 -1.54e+04 -1.51e+04
model_group[T.c] -1.624e+04 117.484 -138.245 0.000 -1.65e+04 -1.6e+04
random_int 1.1349 1.134 1.001 0.317 -1.088 3.358
Omnibus: 312706.725 Durbin-Watson: 1.347
Prob(Omnibus): 0.000 Jarque-Bera (JB): 11735068.230
Skew: 3.189 Prob(JB): 0.00
Kurtosis: 28.048 Cond. No. 226.


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [397]:
# impact of model group as a dummy

y = 'Weekly_Sales'
x = ['Type', 'random_int']
m4a = lr_and_print(data, y, x)

m4a.summary()
RSME: 22304.057417017575
Out[397]:
OLS Regression Results
Dep. Variable: Weekly_Sales R-squared: 0.036
Model: OLS Adj. R-squared: 0.036
Method: Least Squares F-statistic: 5176.
Date: Mon, 10 Jun 2019 Prob (F-statistic): 0.00
Time: 16:18:13 Log-Likelihood: -4.8192e+06
No. Observations: 421570 AIC: 9.638e+06
Df Residuals: 421566 BIC: 9.638e+06
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 2.005e+04 77.064 260.183 0.000 1.99e+04 2.02e+04
Type[T.B] -7862.4037 73.154 -107.478 0.000 -8005.783 -7719.025
Type[T.C] -1.058e+04 118.268 -89.456 0.000 -1.08e+04 -1.03e+04
random_int 0.9549 1.179 0.810 0.418 -1.356 3.265
Omnibus: 308624.022 Durbin-Watson: 1.235
Prob(Omnibus): 0.000 Jarque-Bera (JB): 9292452.642
Skew: 3.209 Prob(JB): 0.00
Kurtosis: 25.087 Cond. No. 213.


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Linear Regression with Validation¶

In [398]:
# split into test and train sets (last 2 months will be test)

data_temp = data.loc[data.Weekly_Sales != 0]

test_data_og = data_temp.loc[data_temp.Date >= (data['Date'].max()-datetime.timedelta(days=60))]
train_data_og = data_temp.loc[data_temp.Date < (data['Date'].max()-datetime.timedelta(days=60))]
In [399]:
model_variables = ['Weekly_Sales', 'Previous_Year_Sales']

test_data = test_data_og[model_variables].dropna()
test_data.loc[test_data.Weekly_Sales == 0, 'Weekly_Sales'] += 1
train_data = train_data_og[model_variables].dropna()
train_data.loc[train_data.Weekly_Sales == 0, 'Weekly_Sales'] += 1

lr = LinearRegression()
lr.fit(train_data.drop('Weekly_Sales', axis=1), train_data['Weekly_Sales'])
lr_preds = lr.predict(test_data.drop('Weekly_Sales', axis=1))
lr_rmse = np.sqrt(mean_squared_error(test_data['Weekly_Sales'].values, lr_preds))
lr_mape = mean_absolute_percentage_error(test_data['Weekly_Sales'].values, lr_preds)
print(f"RMSE for Linear Regression: {lr_rmse}", '\n', f"MAPE for Linear Regression: {lr_mape}")
RMSE for Linear Regression: 3562.0428006507204 
 MAPE for Linear Regression: 301.84667480164006
In [400]:
model_variables = ['Weekly_Sales', 'Previous_Year_Sales', 'ab_np', 'ab_p']

test_data = test_data_og[model_variables].dropna()
train_data = train_data_og[model_variables].dropna()

lr = LinearRegression()
lr.fit(train_data.drop('Weekly_Sales', axis=1), train_data['Weekly_Sales'])
lr_preds = lr.predict(test_data.drop('Weekly_Sales', axis=1))
lr_rmse = np.sqrt(mean_squared_error(test_data['Weekly_Sales'].values, lr_preds))
lr_mape = mean_absolute_percentage_error(test_data['Weekly_Sales'].values, lr_preds)
print(f"RMSE for Linear Regression: {lr_rmse}", '\n', f"MAPE for Linear Regression: {lr_mape}")
RMSE for Linear Regression: 3570.645861754951 
 MAPE for Linear Regression: 682.8952690322984
In [401]:
model_variables = ['Weekly_Sales', 'Ln_Promotion3', 'Ln_Promotion5']

test_data = test_data_og[model_variables].dropna()
train_data = train_data_og[model_variables].dropna()

lr = LinearRegression()
lr.fit(train_data.drop('Weekly_Sales', axis=1), train_data['Weekly_Sales'])
lr_preds = lr.predict(test_data.drop('Weekly_Sales', axis=1))
lr_rmse = np.sqrt(mean_squared_error(test_data['Weekly_Sales'].values, lr_preds))
lr_mape = mean_absolute_percentage_error(test_data['Weekly_Sales'].values, lr_preds)
print(f"RMSE for Linear Regression: {lr_rmse}", '\n', f"MAPE for Linear Regression: {lr_mape}")
RMSE for Linear Regression: 21415.112171255754 
 MAPE for Linear Regression: 68600.79865409435

Random Forest¶

In [402]:
# random forest regression model on full dataset

data_temp = data[['Weekly_Sales', 'Previous_Year_Sales']].dropna()
data_temp = data_temp.loc[data_temp.Weekly_Sales != 0]

m2=RandomForestRegressor().fit(data_temp[['Previous_Year_Sales']], data_temp['Weekly_Sales'])\
                          .predict(data_temp[['Previous_Year_Sales']])

# results from m2
rf=pd.Series({'R^2':metrics.r2_score(data_temp['Weekly_Sales'], m2),
              'MSE':metrics.mean_squared_error(data_temp['Weekly_Sales'], m2),
              'MAE':metrics.mean_absolute_error(data_temp['Weekly_Sales'], m2),
              'RSME':np.sqrt(metrics.mean_squared_error(data_temp['Weekly_Sales'], m2)),
              'MAPE':mean_absolute_percentage_error(data_temp['Weekly_Sales'], m2)
             })
print(rf)
R^2     9.915264e-01
MSE     4.391564e+06
MAE     9.813958e+02
RSME    2.095606e+03
MAPE    2.379129e+02
dtype: float64
In [403]:
# random forest regression model with cross-validation

train_data = train_data_og[['Previous_Year_Sales', 'Weekly_Sales']].dropna()
test_data = test_data_og[['Previous_Year_Sales', 'Weekly_Sales']].dropna()

m2=RandomForestRegressor().fit(train_data[['Previous_Year_Sales']], train_data['Weekly_Sales'])\
                          .predict(test_data[['Previous_Year_Sales']])

# results from m2
rf_cv=pd.Series({'R^2':metrics.r2_score(test_data['Weekly_Sales'], m2),
                 'MSE':metrics.mean_squared_error(test_data['Weekly_Sales'], m2),
                 'MAE':metrics.mean_absolute_error(test_data['Weekly_Sales'], m2),
                 'RSME':np.sqrt(metrics.mean_squared_error(test_data['Weekly_Sales'], m2)),
                 'MAPE':mean_absolute_percentage_error(test_data['Weekly_Sales'], m2)})
print(rf_cv)
R^2     9.566145e-01
MSE     2.085595e+07
MAE     2.286579e+03
RSME    4.566831e+03
MAPE    1.796130e+02
dtype: float64
In [404]:
# random forest regression model with cross-validation

train_data = train_data_og.loc[train_data_og.model_group == 'ab_np'][['Previous_Year_Sales', 'Weekly_Sales']].dropna()
test_data = test_data_og.loc[test_data_og.model_group == 'ab_np'][['Previous_Year_Sales', 'Weekly_Sales']].dropna()

m2=RandomForestRegressor().fit(train_data[['Previous_Year_Sales']], train_data['Weekly_Sales'])\
                          .predict(test_data[['Previous_Year_Sales']])

# results from m2
rf_cv=pd.Series({'R^2':metrics.r2_score(test_data['Weekly_Sales'], m2),
                 'MSE':metrics.mean_squared_error(test_data['Weekly_Sales'], m2),
                 'MAE':metrics.mean_absolute_error(test_data['Weekly_Sales'], m2),
                 'RSME':np.sqrt(metrics.mean_squared_error(test_data['Weekly_Sales'], m2)),
                 'MAPE':mean_absolute_percentage_error(test_data['Weekly_Sales'], m2)})
print(rf_cv)
R^2     9.681180e-01
MSE     2.718239e+07
MAE     2.927088e+03
RSME    5.213674e+03
MAPE    3.726283e+02
dtype: float64
In [405]:
# random forest regression model with cross-validation

train_data = train_data_og.loc[train_data_og.model_group == 'ab_p'][['Previous_Year_Sales', 'Weekly_Sales']].dropna()
test_data = test_data_og.loc[test_data_og.model_group == 'ab_p'][['Previous_Year_Sales', 'Weekly_Sales']].dropna()

m2=RandomForestRegressor().fit(train_data[['Previous_Year_Sales']], train_data['Weekly_Sales'])\
                          .predict(test_data[['Previous_Year_Sales']])

# results from m2
rf_cv=pd.Series({'R^2':metrics.r2_score(test_data['Weekly_Sales'], m2),
                 'MSE':metrics.mean_squared_error(test_data['Weekly_Sales'], m2),
                 'MAE':metrics.mean_absolute_error(test_data['Weekly_Sales'], m2),
                 'RSME':np.sqrt(metrics.mean_squared_error(test_data['Weekly_Sales'], m2)),
                 'MAPE':mean_absolute_percentage_error(test_data['Weekly_Sales'], m2)})
print(rf_cv)
R^2     8.915416e-01
MSE     1.633359e+07
MAE     1.947677e+03
RSME    4.041484e+03
MAPE    1.045010e+02
dtype: float64

Neural Network¶

In [406]:
# can maybe do this by department?
In [407]:
train_data = train_data_og[['Previous_Year_Sales', 'Weekly_Sales']].dropna()
test_data = test_data_og[['Previous_Year_Sales', 'Weekly_Sales']].dropna()

# For more than 2 models, use a loop & dict for this
# Create the models
mlp = MLPRegressor(max_iter=100)
mlp.fit(train_data[['Previous_Year_Sales']], train_data['Weekly_Sales'])
mlp_preds = mlp.predict(test_data[['Previous_Year_Sales']])
mlp_rmse = np.sqrt(mean_squared_error(test_data['Weekly_Sales'].values, mlp_preds))
print(f"RMSE for MLP: {mlp_rmse}" +', ' + f"MAPE: {mean_absolute_percentage_error(test_data[y].values, mlp_preds)}")
RMSE for MLP: 3631.6719015993635, MAPE: 82.91850129610307
In [408]:
# by model_group

y = 'Weekly_Sales'
x = ['Previous_Year_Sales', 'ab_np', 'ab_p']

to_keep = cp.copy(x)
to_keep.append(y)

train_data = train_data_og[to_keep].dropna()
test_data = test_data_og[to_keep].dropna()

# For more than 2 models, use a loop & dict for this
# Create the models
mlp = MLPRegressor(max_iter=100)
mlp.fit(train_data[x], train_data[y])
mlp_preds = mlp.predict(test_data[x])
mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
print(f"RMSE for MLP: {mlp_rmse}" +', ' + f"MAPE: {mean_absolute_percentage_error(test_data[y].values, mlp_preds)}")
RMSE for MLP: 3573.37388934109, MAPE: 101.6626041316233
In [409]:
# splitting by departments (A & B Type only)

promo = ['Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5', 
         'cpi_new', 'Fuel_Price', 'Unemployment', 'Temperature', 'IsHoliday', 'Imputed_Holiday', 'Christmas', 'Thanksgiving',
         'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5',
         'I0_Promotion1', 'I0_Promotion2', 'I0_Promotion3', 'I0_Promotion4', 'I0_Promotion5',
         'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5', 
         'random_int'
        ]
    
y = 'Weekly_Sales'

var = []
rmse_value = []
mape_value = []
for i in range(int(len(promo)+1)):
    x = ['Previous_Year_Sales']
    if i == 0:
        text = 'Previous_Sales_Only'
    else:
        x.append(promo[i-1])
        text = promo[i-1]
    
    to_keep = cp.copy(x)
    to_keep.append(y)

    train_data = train_data_og.loc[train_data_og.model_group == 'ab_np'][to_keep].dropna()
    test_data = test_data_og.loc[test_data_og.model_group == 'ab_np'][to_keep].dropna()

    # For more than 2 models, use a loop & dict for this
    # Create the models
    mlp = MLPRegressor(max_iter=100)
    mlp.fit(train_data[x], train_data[y])
    mlp_preds = mlp.predict(test_data[x])
    mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
    mlp_mape = mean_absolute_percentage_error(test_data[y].values, mlp_preds)
    rmse_value.append(mlp_rmse)
    mape_value.append(mlp_mape)
    var.append(text)
    #print('Done ' + str(i+1))

ab_np_rmse = pd.DataFrame({'Variable':var, 'RMSE':rmse_value, 'MAPE':mape_value})
ab_np_rmse['Data'] = 'ab_np'
In [410]:
# splitting by departments (A & B Type only)

promo = ['Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5', 
         'cpi_new', 'Fuel_Price', 'Unemployment', 'Temperature', 'IsHoliday', 'Imputed_Holiday', 'Christmas', 'Thanksgiving',
         'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5',
         'I0_Promotion1', 'I0_Promotion2', 'I0_Promotion3', 'I0_Promotion4', 'I0_Promotion5',
         'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5', 
         'random_int'
        ]
   
y = 'Weekly_Sales'

var = []
rmse_value = []
mape_value = []
for i in range(int(len(promo)+1)):
    x = ['Previous_Year_Sales']
    if i == 0:
        text = 'Previous_Sales_Only'
    else:
        x.append(promo[i-1])
        text = promo[i-1]
    
    to_keep = cp.copy(x)
    to_keep.append(y)

    train_data = train_data_og.loc[train_data_og.model_group == 'ab_p'][to_keep].dropna()
    test_data = test_data_og.loc[test_data_og.model_group == 'ab_p'][to_keep].dropna()

    # For more than 2 models, use a loop & dict for this
    # Create the models
    mlp = MLPRegressor(max_iter=100)
    mlp.fit(train_data[x], train_data[y])
    mlp_preds = mlp.predict(test_data[x])
    mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
    mlp_mape = mean_absolute_percentage_error(test_data[y].values, mlp_preds)
    rmse_value.append(mlp_rmse)
    mape_value.append(mlp_mape)
    var.append(text)
ab_p_rmse = pd.DataFrame({'Variable':var, 'RMSE':rmse_value, 'MAPE':mape_value})
ab_p_rmse['Data'] = 'ab_p'
In [411]:
# splitting by departments (C Type only)

promo = ['Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5', 
         'cpi_new', 'Fuel_Price', 'Unemployment', 'Temperature', 'IsHoliday', 'Imputed_Holiday', 'Christmas', 'Thanksgiving',
         'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5',
         'I0_Promotion1', 'I0_Promotion2', 'I0_Promotion3', 'I0_Promotion4', 'I0_Promotion5',
         'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5',
         'random_int'
        ]
   
y = 'Weekly_Sales'

var = []
rmse_value = []
mape_value = []
for i in range(int(len(promo)+1)):
    x = ['Previous_Year_Sales']
    if i == 0:
        text = 'Previous_Sales_Only'
    else:
        x.append(promo[i-1])
        text = promo[i-1]
    
    to_keep = cp.copy(x)
    to_keep.append(y)

    train_data = train_data_og.loc[train_data_og.model_group == 'c'][to_keep].dropna()
    test_data = test_data_og.loc[test_data_og.model_group == 'c'][to_keep].dropna()

    # For more than 2 models, use a loop & dict for this
    # Create the models
    mlp = MLPRegressor(max_iter=100)
    mlp.fit(train_data[x], train_data[y])
    mlp_preds = mlp.predict(test_data[x])
    mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
    mlp_mape = mean_absolute_percentage_error(test_data[y].values, mlp_preds)
    rmse_value.append(mlp_rmse)
    mape_value.append(mlp_mape)
    var.append(text)
c_rmse = pd.DataFrame({'Variable':var, 'RMSE':rmse_value, 'MAPE':mape_value})
c_rmse['Data'] = 'c'
In [412]:
rmse_data = ab_np_rmse.append(ab_p_rmse).append(c_rmse)
In [413]:
temp = rmse_data.loc[rmse_data.Data == 'ab_np']
temp_abnp = temp.loc[(temp.RMSE == temp['RMSE'].min()) | (temp.Variable == 'Previous_Sales_Only')]
temp = rmse_data.loc[rmse_data.Data == 'ab_p']
temp_abp = temp.loc[(temp.RMSE == temp['RMSE'].min()) | (temp.Variable == 'Previous_Sales_Only')]
temp = rmse_data.loc[rmse_data.Data == 'c']
temp_c = temp.loc[(temp.RMSE == temp['RMSE'].min()) | (temp.Variable == 'Previous_Sales_Only')]

temp_abnp.append(temp_abp).append(temp_c)
Out[413]:
Variable RMSE MAPE Data
0 Previous_Sales_Only 4198.749005 158.929817 ab_np
28 Imputed_Promotion5 4136.734691 100.881653 ab_np
0 Previous_Sales_Only 3321.915894 42.808075 ab_p
28 Imputed_Promotion5 3200.992908 64.248983 ab_p
0 Previous_Sales_Only 2067.662600 38.420405 c
19 I0_Promotion1 2035.721278 44.169696 c
In [436]:
data_temp = pd.read_csv('imputed_data.csv')[['Store', 'Date', 'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5', 'Imputed_Holiday']]
data_temp['Date'] = pd.to_datetime(data_temp['Date'], format='%Y-%m-%d')
test_data = pd.read_csv('test_all.csv')\
    .merge(data[['Dept', 'model_group', 'Type']].drop_duplicates(), on=['Type', 'Dept'], how='left')
test_data['Date'] = pd.to_datetime(test_data['Date'], format='%d/%m/%Y')
test_data = test_data.merge(data_temp, on=['Date', 'Store'], how='left')\
    .drop(['Year_2', 'Unnamed: 0'], axis=1).drop_duplicates()

py = long_to_wide(data[['Store', 'Dept', 'Year', 'Week', 'Weekly_Sales']], 'Year', 'Weekly_Sales').rename(columns = {2011:2012, 2012:2013})
py1 = wide_to_long(py.drop(2010, axis=1), 'Year', 'Previous_Year_Sales', [2012, 2013])

test_data = test_data.merge(py1, on=['Store', 'Dept', 'Year', 'Week'], how='left').rename(columns={'Weekly_Sales to be predicted':'Weekly_Sales'})
In [437]:
group = 'c'
x = ['Previous_Year_Sales', 'Imputed_Promotion1']
y = 'Weekly_Sales'

link = ['Date', 'Dept', 'Store']
for i in range(len(x)):
    a = x[i]
    link.append(a)

to_keep = cp.copy(link)
to_keep.append(y)

train_temp = data.loc[data.model_group == group][to_keep].dropna()
test_temp = test_data.loc[test_data.model_group == group][link].dropna()

mlp_c = MLPRegressor(max_iter=100)
mlp.fit(train_temp[x], train_temp[y])
mlp_preds = mlp.predict(test_temp[x])

test_temp['Weekly_Sales'] = mlp_preds

fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 20), sharex=True)
train_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#247ab5', legend=False)
test_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#ff8316', legend=False)
axs[0].set_title('Mean Predicted Sales')
axs[0].set_ylabel('Weekly Sales (US$)')

train_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1], c='#247ab5', legend=False)
test_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1],c='#ff8316', legend=False)
axs[1].set_title('Mean Predicted Sales by Store')
axs[1].set_ylabel('Weekly Sales (US$)')

train_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#247ab5', legend=False)
test_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#ff8316', legend=False)
axs[2].set_title('Mean Predicted Sales by Department')
axs[2].set_ylabel('Weekly Sales (US$)')

plt.savefig('predcited_sales_' + group + '.png')
In [438]:
group = 'ab_p'
x = ['Previous_Year_Sales', 'Imputed_Promotion1']
y = 'Weekly_Sales'

link = ['Date', 'Dept', 'Store']
for i in range(len(x)):
    a = x[i]
    link.append(a)

to_keep = cp.copy(link)
to_keep.append(y)

train_temp = data.loc[data.model_group == group][to_keep].dropna()
test_temp = test_data.loc[test_data.model_group == group][link].dropna()

mlp_c = MLPRegressor(max_iter=100)
mlp.fit(train_temp[x], train_temp[y])
mlp_preds = mlp.predict(test_temp[x])

test_temp['Weekly_Sales'] = mlp_preds

fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 20), sharex=True)
train_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#247ab5', legend=False)
test_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#ff8316', legend=False)
axs[0].set_title('Mean Predicted Sales')
axs[0].set_ylabel('Weekly Sales (US$)')

train_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1], c='#247ab5', legend=False)
test_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1],c='#ff8316', legend=False)
axs[1].set_title('Mean Predicted Sales by Store')
axs[1].set_ylabel('Weekly Sales (US$)')

train_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#247ab5', legend=False)
test_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#ff8316', legend=False)
axs[2].set_title('Mean Predicted Sales by Department')
axs[2].set_ylabel('Weekly Sales (US$)')

plt.savefig('predcited_sales_' + group + '.png')
In [439]:
group = 'ab_np'
x = ['Previous_Year_Sales', 'Imputed_Promotion1']
y = 'Weekly_Sales'

link = ['Date', 'Dept', 'Store']
for i in range(len(x)):
    a = x[i]
    link.append(a)

to_keep = cp.copy(link)
to_keep.append(y)

train_temp = data.loc[data.model_group == group][to_keep].dropna()
test_temp = test_data.loc[test_data.model_group == group][link].dropna()

mlp_c = MLPRegressor(max_iter=100)
mlp.fit(train_temp[x], train_temp[y])
mlp_preds = mlp.predict(test_temp[x])

test_temp['Weekly_Sales'] = mlp_preds

fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 20), sharex=True)
train_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#247ab5', legend=False)
test_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#ff8316', legend=False)
axs[0].set_title('Mean Predicted Sales')
axs[0].set_ylabel('Weekly Sales (US$)')

train_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1], c='#247ab5', legend=False)
test_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1],c='#ff8316', legend=False)
axs[1].set_title('Mean Predicted Sales by Store')
axs[1].set_ylabel('Weekly Sales (US$)')

train_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#247ab5', legend=False)
test_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#ff8316', legend=False)
axs[2].set_title('Mean Predicted Sales by Department')
axs[2].set_ylabel('Weekly Sales (US$)')

plt.savefig('predcited_sales_' + group + '.png')
In [440]:
group = 'all_w_dummies'
x = ['Previous_Year_Sales', 'Imputed_Promotion1']
y = 'Weekly_Sales'

link = ['Date', 'Dept', 'Store', 'Type']
dummies = ['ab_np', 'ab_p']
for i in range(len(x)):
    a = x[i]
    link.append(a)

to_keep = cp.copy(link)
to_keep.append(y)

dummies_data = data[['ab_np', 'ab_p', 'Store', 'Type']].drop_duplicates()

train_temp = data[to_keep].dropna().merge(dummies_data, on=['Store', 'Type'], how='left')
test_temp = test_data[link].dropna().merge(dummies_data, on=['Store', 'Type'], how='left')

to_keep += dummies

x += dummies

mlp_c = MLPRegressor(max_iter=100)
mlp.fit(train_temp[x], train_temp[y])
mlp_preds = mlp.predict(test_temp[x])

test_temp['Weekly_Sales'] = mlp_preds

fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 20), sharex=True)
train_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#247ab5', legend=False)
test_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#ff8316', legend=False)
axs[0].set_title('Mean Predicted Sales')
axs[0].set_ylabel('Weekly Sales (US$)')

train_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1], c='#247ab5', legend=False)
test_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1],c='#ff8316', legend=False)
axs[1].set_title('Mean Predicted Sales by Store')
axs[1].set_ylabel('Weekly Sales (US$)')

train_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#247ab5', legend=False)
test_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#ff8316', legend=False)
axs[2].set_title('Mean Predicted Sales by Department')
axs[2].set_ylabel('Weekly Sales (US$)')

plt.savefig('predcited_sales_' + group + '.png')
In [442]:
group = 'all_wo_dummies'
x = ['Previous_Year_Sales']
y = 'Weekly_Sales'

link = ['Date', 'Dept', 'Store', 'Type']

for i in range(len(x)):
    a = x[i]
    link.append(a)

to_keep = cp.copy(link)
to_keep.append(y)

train_temp = data[to_keep].dropna()
test_temp = test_data[link].dropna()

mlp_c = MLPRegressor(max_iter=100)
mlp.fit(train_temp[x], train_temp[y])
mlp_preds = mlp.predict(test_temp[x])

test_temp['Weekly_Sales'] = mlp_preds

fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 20), sharex=True)
train_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#247ab5', legend=False)
test_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#ff8316', legend=False)
axs[0].set_title('Mean Predicted Sales')
axs[0].set_ylabel('Weekly Sales (US$)')

train_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1], c='#247ab5', legend=False)
test_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1],c='#ff8316', legend=False)
axs[1].set_title('Mean Predicted Sales by Store')
axs[1].set_ylabel('Weekly Sales (US$)')

train_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#247ab5', legend=False)
test_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#ff8316', legend=False)
axs[2].set_title('Mean Predicted Sales by Department')
axs[2].set_ylabel('Weekly Sales (US$)')

plt.savefig('predcited_sales_' + group + '.png')
In [445]:
# previous sales only by model group

y = 'Weekly_Sales'
x = ['Previous_Year_Sales']

var = []
rmse_value = []
mape_value = []

mlp_list = []

to_keep = ['Weekly_Sales', 'Previous_Year_Sales', 'Date', 'Store']

fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(20,10))

#dfs
train_data = train_data_og[to_keep].dropna()
test_data = test_data_og[to_keep].dropna()
    
#models
mlp = MLPRegressor(max_iter=100)
mlp.fit(train_data[x], train_data[y])
mlp_preds = mlp.predict(test_data[x])
mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
mlp_mape = mean_absolute_percentage_error(test_data[y].values, mlp_preds)
    
#extract stats
text=group[i]
rmse_value.append(mlp_rmse)
mape_value.append(mlp_mape)
var.append(text)
    
#figure
data_temp = cp.copy(test_data)
data_temp['Predicted Sales'] = mlp_preds
data_temp[['Weekly_Sales', 'Date']].rename(columns={'Weekly_Sales':'Actual Sales'}).groupby('Date').mean().plot(ax=axs)
data_temp[['Predicted Sales', 'Date']].groupby('Date').mean().plot(ax=axs)
axs.set_title(text)
axs.set_ylabel('Mean Weekly Sales (US$)')
    
ps_only = pd.DataFrame({'Variable':var, 'RMSE':rmse_value, 'MAPE':mape_value})
ps_only.head()
Out[445]:
Variable RMSE MAPE
0 _ 3795.178289 82.894708
In [428]:
# previous sales only by model group

group = ['ab_np', 'ab_p', 'c']
   
y = 'Weekly_Sales'
x = ['Previous_Year_Sales']

var = []
rmse_value = []
mape_value = []

mlp_list = []

to_keep = ['Weekly_Sales', 'Previous_Year_Sales', 'Date', 'Store']

fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(20,10))

for i in range(len(group)):
    #dfs
    train_data = train_data_og.loc[train_data_og.model_group == group[i]][to_keep].dropna()
    test_data = test_data_og.loc[test_data_og.model_group == group[i]][to_keep].dropna()
    
    #models
    mlp = MLPRegressor(max_iter=100)
    mlp.fit(train_data[x], train_data[y])
    mlp_preds = mlp.predict(test_data[x])
    mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
    mlp_mape = mean_absolute_percentage_error(test_data[y].values, mlp_preds)
    
    #extract stats
    text=group[i]
    rmse_value.append(mlp_rmse)
    mape_value.append(mlp_mape)
    var.append(text)
    
    #figure
    data_temp = cp.copy(test_data)
    data_temp['Predicted Sales'] = mlp_preds
    data_temp[['Weekly_Sales', 'Date']].rename(columns={'Weekly_Sales':'Actual Sales'}).groupby('Date').mean().plot(ax=axs[i])
    data_temp[['Predicted Sales', 'Date']].groupby('Date').mean().plot(ax=axs[i])
    axs[i].set_title(text)
    axs[i].set_ylabel('Mean Weekly Sales (US$)')
    
ps_only = pd.DataFrame({'Variable':var, 'RMSE':rmse_value, 'MAPE':mape_value})
ps_only.head()
Out[428]:
Variable RMSE MAPE
0 ab_np 4202.148835 159.517909
1 ab_p 3416.524482 38.392870
2 c 2052.501660 38.843964
In [429]:
# combining rmse values into one file

data_temp = long_to_wide(rmse_data.drop('MAPE', axis=1), 'Data', 'RMSE').set_index('Variable').T

cols = data_temp.columns.tolist()

for col in cols[1:]:
    #data_temp.loc[data_temp[col] > data_temp['Previous_Sales_Only'], col] = np.nan
    data_temp[col] = data_temp[col] - data_temp['Previous_Sales_Only']

data_temp['Previous_Sales_Only'] = 0.0
data_temp = data_temp.T.rename(columns={'ab_np':'diff_ab_np', 'ab_p':'diff_ab_p', 'c':'diff_c'})
rmse_data = long_to_wide(rmse_data.drop('MAPE', axis=1), 'Data', 'RMSE').set_index('Variable').join(data_temp)

rmse_data.to_csv('nn_rmse_results2.csv')
In [430]:
rmse_data.sort_values('diff_ab_p')
Out[430]:
ab_np ab_p c diff_ab_np diff_ab_p diff_c
Variable
Imputed_Promotion5 4136.734691 3200.992908 2044.362988 -62.014314 -120.922986 -23.299613
Imputed_Promotion4 4224.072684 3213.146041 2067.603371 25.323679 -108.769853 -0.059230
Promotion1 4302.087172 3222.726009 2090.623962 103.338167 -99.189885 22.961361
I0_Promotion5 4194.133409 3228.552358 2072.966123 -4.615597 -93.363536 5.303523
Promotion5 4243.735694 3235.528088 2054.954387 44.986688 -86.387806 -12.708213
Imputed_Promotion1 4266.953446 3261.501043 2069.810983 68.204441 -60.414851 2.148383
I0_Promotion1 4193.266017 3285.129760 2035.721278 -5.482988 -36.786134 -31.941323
Ln_Promotion5 4228.262689 3305.410064 2052.013684 29.513684 -16.505830 -15.648916
I0_Promotion4 4240.975752 3312.566387 2070.076232 42.226746 -9.349507 2.413632
Previous_Sales_Only 4198.749005 3321.915894 2067.662600 0.000000 0.000000 0.000000
IsHoliday 4201.170203 3337.008615 2053.463736 2.421198 15.092721 -14.198865
Promotion4 4353.745463 3352.553265 2899.667278 154.996458 30.637371 832.004678
Fuel_Price 4416.681444 3356.093415 2061.384430 217.932438 34.177521 -6.278171
Ln_Promotion4 4193.800906 3363.048413 2051.602287 -4.948099 41.132520 -16.060314
random_int 4207.007460 3366.953838 2058.377595 8.258455 45.037944 -9.285005
Ln_Promotion1 4213.648056 3369.315942 2052.976867 14.899051 47.400048 -14.685734
Unemployment 4251.204567 3373.596542 2060.011932 52.455562 51.680648 -7.650668
Temperature 4190.580121 3377.184066 2055.244264 -8.168884 55.268173 -12.418336
I0_Promotion3 4253.280510 3379.262446 2047.020589 54.531505 57.346553 -20.642011
Ln_Promotion3 4192.121253 3385.035751 2056.196510 -6.627752 63.119857 -11.466091
cpi_new 4207.848277 3386.402040 2051.638644 9.099271 64.486147 -16.023957
I0_Promotion2 4202.640170 3396.617085 2063.008885 3.891165 74.701191 -4.653715
Ln_Promotion2 4205.043814 3399.831554 2053.486224 6.294809 77.915660 -14.176377
Thanksgiving 4288.839431 3408.684989 2061.554094 90.090425 86.769096 -6.108507
Imputed_Promotion2 4251.162379 3415.598423 2122.327947 52.413374 93.682530 54.665347
Imputed_Promotion3 4205.725695 3424.282703 2048.170349 6.976690 102.366810 -19.492251
Imputed_Holiday 4197.028806 3474.743895 2052.598586 -1.720199 152.828001 -15.064014
Christmas 4252.468235 3504.193801 2058.394117 53.719229 182.277907 -9.268483
Promotion3 4462.433932 3568.816999 2166.507371 263.684926 246.901106 98.844770
Promotion2 4339.489061 3757.182421 2142.918114 140.740056 435.266527 75.255513
In [431]:
# testing variable combinations --> ab_np

promo_single = ['Imputed_Promotion5', 'Imputed_Promotion2']

promo = []
for i in range(len(promo_single)):
    x = promo_single[i]
    for j in range(i+1, len(promo_single)):
        y = promo_single[j]
        promo.append([x, y])
            
y = 'Weekly_Sales'
x = ['Previous_Year_Sales']

for i in range(len(promo)):
    to_keep = x + promo[i]
    to_keep.append(y)   
    text = promo[i][0] + ' and ' + promo[i][1]

    train_data = train_data_og.loc[train_data_og.model_group == 'ab_np'][to_keep].dropna()
    test_data = test_data_og.loc[test_data_og.model_group == 'ab_np'][to_keep].dropna()

    mlp = MLPRegressor(max_iter=100)
    mlp.fit(train_data[x], train_data[y])
    mlp_preds = mlp.predict(test_data[x])
    mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
    mlp_mape = mean_absolute_percentage_error(test_data[y].values, mlp_preds)
    print(text, '\t', f"RMSE: {mlp_rmse}", '\t', f"MAPE: {mlp_mape}")
Imputed_Promotion5 and Imputed_Promotion2 	 RMSE: 4235.801933672718 	 MAPE: 160.16620838972253
In [432]:
# testing variable combinations --> ab_p

promo_single = ['Promotion5', 'Promotion1', 'Unemployment', 'Imputed_Holiday']

promo = []
for i in range(len(promo_single)):
    x = promo_single[i]
    for j in range(i+1, len(promo_single)):
        y = promo_single[j]
        promo.append([x, y])
            
y = 'Weekly_Sales'
x = ['Previous_Year_Sales']

for i in range(len(promo)):
    to_keep = x + promo[i]
    to_keep.append(y)   
    text = promo[i][0] + ' and ' + promo[i][1]

    train_data = train_data_og.loc[train_data_og.model_group == 'ab_p'][to_keep].dropna()
    test_data = test_data_og.loc[test_data_og.model_group == 'ab_p'][to_keep].dropna()

    mlp = MLPRegressor(max_iter=100)
    mlp.fit(train_data[x], train_data[y])
    mlp_preds = mlp.predict(test_data[x])
    mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
    mlp_mape = mean_absolute_percentage_error(test_data[y].values, mlp_preds)
    print(text, '\t', f"RMSE: {mlp_rmse}", '\t', f"MAPE: {mlp_mape}")
Promotion5 and Promotion1 	 RMSE: 3331.7741295826318 	 MAPE: 38.409250382086014
Promotion5 and Unemployment 	 RMSE: 3361.2630257964547 	 MAPE: 37.863420873805204
Promotion5 and Imputed_Holiday 	 RMSE: 3367.2153926772503 	 MAPE: 38.00334727136573
Promotion1 and Unemployment 	 RMSE: 3411.0126545280536 	 MAPE: 37.878877916640874
Promotion1 and Imputed_Holiday 	 RMSE: 3406.8155795614716 	 MAPE: 37.6667349844977
Unemployment and Imputed_Holiday 	 RMSE: 3384.271267096651 	 MAPE: 38.54459056744018
In [433]:
# testing variable combinations --> c

promo_single = ['Promotion3', 'Promotion4']

promo = []
for i in range(len(promo_single)):
    x = promo_single[i]
    for j in range(i+1, len(promo_single)):
        y = promo_single[j]
        promo.append([x, y])
            
y = 'Weekly_Sales'
x = ['Previous_Year_Sales']

for i in range(len(promo)):
    to_keep = x + promo[i]
    to_keep.append(y)   
    text = promo[i][0] + ' and ' + promo[i][1]

    train_data = train_data_og.loc[train_data_og.model_group == 'c'][to_keep].dropna()
    test_data = test_data_og.loc[test_data_og.model_group == 'c'][to_keep].dropna()

    mlp = MLPRegressor(max_iter=100)
    mlp.fit(train_data[x], train_data[y])
    mlp_preds = mlp.predict(test_data[x])
    mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
    print(text, f"RMSE for MLP: {mlp_rmse}")
Promotion3 and Promotion4 RMSE for MLP: 3038.573603030861